package com.hrhx.springboot.crawler;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
import com.hrhx.springboot.domain.ImookCourse;
import com.hrhx.springboot.mysql.repository.ImookCourseRepository;
import com.hrhx.springboot.util.SpringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler for the imooc free-course catalogue (https://www.imooc.com/course/list).
 *
 * <p>The crawl walks four levels, each identified by a datum type:
 * <ol>
 *   <li>{@code direction} - the seed page; emits one datum per direction link;</li>
 *   <li>{@code classify} - a direction page; emits one datum per classification link;</li>
 *   <li>{@code page} - a classification page; emits one datum per result page;</li>
 *   <li>{@code page-list} - a result page; parses and saves every course card.</li>
 * </ol>
 * Direction and classification codes/names travel down the levels as datum metadata.
 *
 * <p>NOTE(review): the constructor starts the crawl itself, so creating an instance
 * runs the whole job before {@code new} returns.
 *
 * @author duhongming
 * @version 1.0
 * @date 2019-11-30 08:27
 */
public class ImookCourseCrawler extends BreadthCrawler {

    /** Repository every parsed course is saved through; looked up from the Spring context. */
    private final ImookCourseRepository repository;

    private static final String FILTER_NUMBER = "\\d+";
    /** Matches the first run of decimal digits in a string. */
    public static final Pattern FILTER_NUMBER_PATTERN = Pattern.compile(FILTER_NUMBER);

    /** Metadata keys set on the direction level and forwarded to the levels below it. */
    private static final String[] DIRECTION_META_KEYS = {"directionCode", "directionName"};
    /** Metadata keys set on the classification level and forwarded to the levels below it. */
    private static final String[] CLASSIFY_META_KEYS = {"classifyCode", "classifyName"};

    /**
     * Creates the crawler, registers the catalogue seed and immediately runs the crawl
     * with a single thread and a depth of 4 (one depth per level described on the class).
     *
     * @param crawlPath passed through to {@link BreadthCrawler}
     * @param autoParse passed through to {@link BreadthCrawler}
     * @throws Exception if the superclass constructor or the crawl itself fails
     */
    public ImookCourseCrawler(String crawlPath, boolean autoParse) throws Exception {
        super(crawlPath, autoParse);
        repository = SpringUtil.getBean(ImookCourseRepository.class);
        this.addSeed("https://www.imooc.com/course/list", "direction");
        this.setThreads(1);
        this.start(4);
    }

    /**
     * Intentionally empty: all work is done by the {@code @MatchType} handlers below.
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {

    }

    /**
     * Handles the seed page: queues one {@code classify} datum per direction link,
     * tagging it with the link's {@code data-ct} code and its text.
     *
     * @param page the fetched catalogue page
     * @param next collector for the follow-up datums
     */
    @MatchType(types = "direction")
    public void handleDirection(Page page, CrawlDatums next) {
        Elements directions = page.select("div.course-content > div > div:nth-child(1) > div > ul > li > a");
        // Index 0 is skipped on purpose - presumably the "all" entry; confirm against the site.
        for (int i = 1; i < directions.size(); i++) {
            Element link = directions.get(i);
            CrawlDatum crawlDatum = new CrawlDatum(link.attr("abs:href"), "classify");
            crawlDatum.meta("directionCode", link.attr("data-ct"));
            crawlDatum.meta("directionName", link.text());
            next.add(crawlDatum);
        }
    }

    /**
     * Handles a direction page: queues one {@code page} datum per classification link.
     * The classification code is the link URL with the catalogue prefix removed.
     *
     * @param page the fetched direction page (carries the direction metadata)
     * @param next collector for the follow-up datums
     */
    @MatchType(types = "classify")
    public void handleClassify(Page page, CrawlDatums next) {
        Elements classifys = page.select("div.course-content div.js-course-skills > div > ul > li > a");
        // Index 0 is skipped on purpose - presumably the "all" entry; confirm against the site.
        for (int i = 1; i < classifys.size(); i++) {
            Element link = classifys.get(i);
            String url = link.attr("abs:href");
            String classifyCode = url.replace("https://www.imooc.com/course/list?c=", "");
            CrawlDatum crawlDatum = new CrawlDatum(url, "page");
            copyMeta(page, crawlDatum, DIRECTION_META_KEYS);
            crawlDatum.meta("classifyCode", classifyCode);
            crawlDatum.meta("classifyName", link.text());
            next.add(crawlDatum);
        }
    }

    /**
     * Handles a classification page: reads the total page count from the pager and
     * queues one {@code page-list} datum per result page.
     *
     * <p>The count is parsed once, before the loop. If the pager element is missing or
     * holds no number, a single page is assumed so the classification is still crawled
     * (assumption: a listing without a pager fits on one page - confirm against the site).
     *
     * @param page the fetched classification page (carries direction and classification metadata)
     * @param next collector for the follow-up datums
     */
    @MatchType(types = "page")
    public void handlePage(Page page, CrawlDatums next) {
        Element pagerTotal = page.select("em.pager-total").first();
        Integer parsedTotal = pagerTotal == null ? null : firstInteger(pagerTotal.text());
        int totalPages = parsedTotal == null ? 1 : parsedTotal.intValue();

        for (int i = 1; i <= totalPages; i++) {
            // The classification URL already ends in "?c=...", hence "&" rather than "?".
            String url = page.url() + "&page=" + i;
            CrawlDatum crawlDatum = new CrawlDatum(url, "page-list");
            copyMeta(page, crawlDatum, DIRECTION_META_KEYS);
            copyMeta(page, crawlDatum, CLASSIFY_META_KEYS);
            next.add(crawlDatum);
        }
    }

    /**
     * Handles one result page: builds an {@link ImookCourse} from every course card and
     * saves it. All courses of a page share the same timestamp.
     *
     * <p>The student count and the course id are taken from the first run of digits in
     * their source text; when no digits are present the field is left unset instead of
     * aborting the remaining cards of the page.
     *
     * @param page        the fetched result page (carries direction and classification metadata)
     * @param crawlDatums unused; this is the last crawl level
     */
    @MatchType(types = "page-list")
    public void handlePageList(Page page, CrawlDatums crawlDatums) {
        Date date = new Date();

        Elements courseContainer = page.select("div.course-card-container");
        for (Element element : courseContainer) {

            ImookCourse course = new ImookCourse();

            course.setDirectionCode(page.meta("directionCode"));
            course.setDirectionName(page.meta("directionName"));

            course.setClassifyCode(page.meta("classifyCode"));
            course.setClassifyName(page.meta("classifyName"));

            String courseDifficulty = element.select("div.course-card-content div.course-card-info > span:nth-child(1)").text();
            course.setCourseDifficulty(courseDifficulty);

            String courseTitle = element.select("div.course-card-content > h3").text();
            course.setCourseTitle(courseTitle);

            String courseDesc = element.select("div.course-card-content p.course-card-desc").text();
            course.setCourseDesc(courseDesc);

            String courseStudents = element.select("div.course-card-content div.course-card-info > span:nth-child(2)").text();
            Integer students = firstInteger(courseStudents);
            if (students != null) {
                course.setCourseStudents(students.intValue());
            }

            String url = element.select("a").attr("abs:href");
            course.setCourseUrl(url);

            Integer courseId = firstInteger(url);
            if (courseId != null) {
                course.setCourseId(courseId.intValue());
            }

            course.setCmpTime(date);

            repository.save(course);
        }

    }

    /**
     * Copies the given metadata entries from a fetched page onto a follow-up datum.
     *
     * @param source page whose metadata is read
     * @param target datum that receives the entries
     * @param keys   metadata keys to copy
     */
    private static void copyMeta(Page source, CrawlDatum target, String... keys) {
        for (String key : keys) {
            target.meta(key, source.meta(key));
        }
    }

    /**
     * Extracts the first run of decimal digits from {@code text} as an integer.
     *
     * @param text text to scan; may be {@code null}
     * @return the parsed value, or {@code null} when {@code text} is {@code null},
     *         contains no digits, or the digits do not fit into an {@code int}
     */
    private static Integer firstInteger(String text) {
        if (text == null) {
            return null;
        }
        Matcher matcher = FILTER_NUMBER_PATTERN.matcher(text);
        if (!matcher.find()) {
            return null;
        }
        try {
            return Integer.valueOf(matcher.group());
        } catch (NumberFormatException ignored) {
            // Only reachable when the digit run overflows int; treated as "no usable number".
            return null;
        }
    }
}
